
"""
Imputation script for TRA layer
--------------------------------
Requires: pandas, numpy, fuzzywuzzy, geopy
Inputs: raw_bulletins.csv, country_languages.csv, twb_distribution.csv
Outputs: tra_imputed.csv
"""

import pandas as pd, numpy as np
from fuzzywuzzy import fuzz
from geopy.geocoders import Nominatim

# Seeded generator: keeps the random language draws below reproducible.
rng = np.random.default_rng(42)

# Bulletin records that the rest of the script fills in.
raw = pd.read_csv('raw_bulletins.csv')

# 1. Language tag imputation
# Both tables are looked up by 'country' and read for their 'iso' column.
# The generator is consumed left to right, so the files are read in the
# order listed.
lang_lookup, dist = (
    pd.read_csv(csv_path)
    for csv_path in ('country_languages.csv', 'twb_distribution.csv')
)

def impute_lang(row):
    """Return the language code for one bulletin row, imputing it if missing.

    An existing ``row['language']`` value is returned unchanged.  Otherwise a
    code is drawn at random (via the module-level ``rng``) from the ``iso``
    values that ``dist`` lists for ``row['country']``.  When ``dist`` has no
    rows for that country, the first ``iso`` value listed for it in
    ``lang_lookup`` is used instead.

    Args:
        row: A record exposing ``'language'`` and ``'country'`` keys, such as
            a row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original language, an imputed ``iso`` code, or ``np.nan`` when
        neither table has an entry for the country.
    """
    language = row['language']
    if not pd.isna(language):
        return language

    country = row['country']

    # NOTE(review): every matching row is equally likely here; if
    # twb_distribution.csv carries a weight/share column it is not used.
    # Confirm that a uniform draw is what is intended.
    candidates = dist.loc[dist['country'] == country, 'iso'].tolist()
    if candidates:
        return rng.choice(candidates)

    # Fallback: official languages
    official = lang_lookup.loc[lang_lookup['country'] == country, 'iso']
    if official.empty:
        # Unknown (or missing) country: leave the value missing instead of
        # letting .iloc[0] raise IndexError and abort the whole run.
        return np.nan
    return official.iloc[0]

# Run the language imputation once per bulletin row and keep the result in
# its own column, leaving the original 'language' column untouched.
raw['language_imputed'] = raw.apply(func=impute_lang, axis='columns')

# 2. Geo-coordinate imputation
# Nominatim client used for the coordinate lookups that follow.
geolocator = Nominatim(user_agent='tra_imputer')
def geocode_loc(row):
    """Return the ``lat``/``lon`` pair for one row, geocoding it if needed.

    Rows that already have a latitude are passed through untouched.  For the
    rest, ``"<locality>, <country>"`` is sent to the module-level
    ``geolocator``; a failed or empty lookup yields NaN coordinates so that
    one bad row cannot stop the batch.

    Args:
        row: A record exposing ``'lat'``, ``'lon'``, ``'locality'`` and
            ``'country'`` keys.

    Returns:
        pandas.Series: indexed by ``'lat'`` and ``'lon'``.

    Raises:
        KeyError: if one of the expected keys is absent from ``row``.
    """
    # NOTE(review): only 'lat' decides whether to geocode; a row with a
    # latitude but no longitude is returned as-is. Confirm that is intended.
    if not pd.isna(row['lat']):
        return pd.Series({'lat': row['lat'], 'lon': row['lon']})

    unresolved = pd.Series({'lat': np.nan, 'lon': np.nan})

    locality, country = row['locality'], row['country']
    if not (isinstance(locality, str) and isinstance(country, str)):
        # Nothing usable to query with (NaN or non-text place names).
        return unresolved

    # NOTE(review): the public Nominatim service has a usage policy that
    # limits request rate; consider throttling (geopy ships a RateLimiter)
    # if this runs over many rows.
    try:
        loc = geolocator.geocode(f'{locality}, {country}', timeout=10)
    except Exception:
        # Best-effort lookup: service/timeout errors leave the row unresolved.
        # Deliberately not a bare ``except`` so Ctrl-C and SystemExit still
        # stop a long run.
        return unresolved

    if loc is None:
        # The geocoder found no match for the query.
        return unresolved
    return pd.Series({'lat': loc.latitude, 'lon': loc.longitude})

# Resolve coordinates row by row, then overwrite both columns in one step.
coords = raw.apply(func=geocode_loc, axis='columns')
raw[['lat', 'lon']] = coords

# 3. Deployment size
# Rows with no recorded team size are counted as a deployment of one.
raw['deploy_n'] = raw['team_size'].fillna(value=1)

# Save
# The row index is left out of the output file.
raw.to_csv(path_or_buf='tra_imputed.csv', index=False)
print('Imputation complete.')
